import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import pickle
from decision_company import read_csv_file, create_dataframe, df_copy, concatenate_objects, bind_dataframe, aggregate_grouped_data, positive_infinity, make_bins, join_dataframes, create_figure, create_barplot, set_plot_title, set_yaxis_label, show_plots, join_dataframes, anova_test, save_plot

atp_tennis = read_csv_file('atp_tennis.csv')

# Reshape the match-level table into one row per (match, player) so both sides
# of a match share the same column names and can be analysed together.
# NOTE(review): assumes the CSV already carries Win_Loss_Ratio_1 and
# Win_Loss_Ratio_2 columns -- confirm against the data-preparation step.
player_columns = ['Player', 'Rank', 'Win_Loss_Ratio', 'Surface']

# Player_1 side of every match (copied so renaming columns cannot touch atp_tennis)
player_data_1 = df_copy(atp_tennis[['Player_1', 'Rank_1', 'Win_Loss_Ratio_1', 'Surface']])
player_data_1.columns = player_columns

# Player_2 side of every match
player_data_2 = df_copy(atp_tennis[['Player_2', 'Rank_2', 'Win_Loss_Ratio_2', 'Surface']])
player_data_2.columns = player_columns

# Stack both sides into a single long table. No placeholder frame is built
# beforehand: this assignment is the only definition of player_data that is
# ever read.
player_data = concatenate_objects(player_data_1, player_data_2, ignore_index=True)

# Collapse the per-match rows into one row per player: mean rank, mean
# win/loss ratio, and the 'count' aggregate of Surface (used as a match count).
per_player_groups = bind_dataframe(player_data, 'Player')
unique_player_data = aggregate_grouped_data(
    per_player_groups,
    {'Rank': 'mean', 'Win_Loss_Ratio': 'mean', 'Surface': 'count'},
)
# NOTE(review): the rename below relies on aggregate_grouped_data returning
# 'Player' as the first of exactly four columns -- confirm against the helper.
unique_player_data.columns = ['Player', 'Avg_Rank', 'Avg_Win_Loss_Ratio', 'Match_Count']

# Bucket players by average rank using the edges 0, 50, 200 and +infinity
rank_edges = [0, 50, 200, positive_infinity()]
rank_labels = ['Top-ranked', 'Mid-ranked', 'Low-ranked']
unique_player_data['Rank_Group'] = make_bins(
    unique_player_data['Avg_Rank'],
    bins=rank_edges,
    labels=rank_labels,
)

# Attach each player's ranking group to the per-match rows, then average the
# win/loss ratio within every (ranking group, surface) pair.
rows_with_rank_group = join_dataframes(
    player_data,
    unique_player_data[['Player', 'Rank_Group']],
    on='Player',
)
by_group_and_surface = bind_dataframe(rows_with_rank_group, ['Rank_Group', 'Surface'])
grouped_data = aggregate_grouped_data(by_group_and_surface, {'Win_Loss_Ratio': 'mean'})

# Bar chart: one cluster per surface, one bar per ranking group, bar height is
# the averaged win/loss ratio held in grouped_data.
create_figure(figsize=(12, 6))
create_barplot(
    data=grouped_data,
    x='Surface',
    y='Win_Loss_Ratio',
    hue='Rank_Group',
)
set_plot_title('Win/Loss Ratios by Surface Type and Ranking Group')
set_yaxis_label('Average Win/Loss Ratio')

# Write the image to disk first, then display the figure.
save_plot("./ref_result/barplot.png")
show_plots()

# For each ranking group, test whether Win_Loss_Ratio differs across the three
# surfaces. The stored value is the test's p-value, or None when at least one
# surface has no non-null observations for that group.
anova_results = {}
surfaces = ('Hard', 'Clay', 'Grass')  # order fixes the positional order passed to anova_test
for rank_group in ['Top-ranked', 'Mid-ranked', 'Low-ranked']:
    # Keep only the per-match rows whose player belongs to this ranking group
    group_members = unique_player_data[unique_player_data['Rank_Group'] == rank_group][['Player', 'Rank_Group']]
    group_rows = join_dataframes(player_data, group_members, on='Player')

    # One sample of non-null ratios per surface
    samples = [
        group_rows['Win_Loss_Ratio'][group_rows['Surface'] == surface].dropna()
        for surface in surfaces
    ]

    if all(len(sample) > 0 for sample in samples):
        anova_results[rank_group] = anova_test(*samples).pvalue
    else:
        anova_results[rank_group] = None

# Print the statistical test results (P-values), one line per ranking group
print("Statistical Test Results (P-values):")
for group, pvalue in anova_results.items():
    print(f"{group}: {pvalue}")

print(anova_results)

# Persist the results. The context manager guarantees the file handle is
# flushed and closed even if pickling raises, instead of leaving that to
# garbage collection.
with open("./ref_result/anova_results.pkl", "wb") as results_file:
    pickle.dump(anova_results, results_file)